Yelp Explorartory Data Analysis

In [30]:
import numpy as np
import pandas as pd
import plotly
import cufflinks as cf
import seaborn as sns
plotly.tools.set_credentials_file(username='DesciuitV',api_key='Q1xpEcxmp80nvR9Cho16')
import plotly.plotly as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode,iplot
import plotly.io as pio
from ipywidgets import interactive, HBox, VBox, widgets, interact

init_notebook_mode(connected=True)
#cf.set_config_file(theme='ggplot')
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')
Out[30]:
In [72]:
df1 = pd.read_csv("business.csv",dtype={'business_id': str})
df2 = pd.read_csv("review.csv",dtype={'business_id': str, 'review_id': str,'user_id':str})
In [32]:
df1.dropna(subset=['categories'],inplace=True)
In [33]:
cat = "Restaurants|Ice Cream & Frozen Yogurt|Food|Coffee & Tea|Desserts|Pizza|Bars|Fast Food|FastFood|Sandiwches|Bakeries|Delis|Breakfast|Pub|Bars|Cafe"
nocat = "Grocery|Spa|Nail Salons|Beauty & Spas|Barbers|Hair Salons|Tattoo|Hair|Nails|Auto|Automotive|Auto Repair|Car Rental|Car Dealers|Real Estate|Apartments|Home Services|Pest Control|Self StorageBanks|Credit Unions|Financial ServicesVeterinarians|Pets|Pet StoresHealth & Medical|Doctors|Health|DentistsLocal Services|Laundry|Libraries|Repair|Cinema|Arts|Public Services|Government|Professional|Religious Organizations|ChurchesProfessional ServicesShoppingActive Life|Yoga|Golf|Swimming Pools|Hiking|Amusement Parks|Gyms|Education|Mass Media|Radio Stations|Local Flavor|Guns|Tax|Magic|Boat|Tours|Watches|Bike"

df1.loc[(df1['categories'].str.contains(cat)) & (~df1['categories'].str.contains(nocat)),'categories'] = 'Restaurants/Food'
In [34]:
df1.loc[(df1['categories'].str.contains("Grocery")) & (df1['categories'].str.contains("Food")),'categories']  = 'Supermarket/ConvenienceStores'
df1.loc[df1['categories'].str.contains("Nail Salons|Beauty & Spas|Barbers|Hair Salons|Tattoo|Hair|Nails"),'categories'] = 'Beauty/Spa Services'
df1.loc[df1['categories'].str.contains("Auto|Automotive|Auto Repair|Car Rental|Car Dealers"),'categories'] = 'Car Services'
df1.loc[df1['categories'].str.contains("Real Estate|Apartments|Home Services|Pest Control|Self Storage"),'categories'] = 'Real Estate & Home Services'
df1.loc[df1['categories'].str.contains("Banks|Credit Unions|Financial Services"),'categories'] = 'Financial Services'
df1.loc[df1['categories'].str.contains("Veterinarians|Pets|Pet Stores"),'categories'] = 'Pet Stores/Services'
df1.loc[df1['categories'].str.contains("Hotels|Event|Travel"),'categories'] = 'Hotels & Event Planning'
df1.loc[df1['categories'].str.contains("Health & Medical|Doctors|Health|Dentists"),'categories'] = 'Health & Medical Services'
df1.loc[df1['categories'].str.contains("Local Services|Laundry|Libraries|Repair|Cinema|Arts|Public Services|Government|Professional|Religious Organizations|Churches"),'categories'] = 'Local/Government Services'
df1.loc[df1['categories'].str.contains("Professional Services"),'categories'] = 'Professional Services'
df1.loc[df1['categories'].str.contains("Shopping"),'categories'] = 'Shopping'
df1.loc[df1['categories'].str.contains("Active Life|Yoga|Golf|Swimming Pools|Hiking|Amusement Parks|Gyms"),'categories'] = 'Active Life'
df1.loc[df1['categories'].str.contains("Education"),'categories'] = 'Education'
df1.loc[df1['categories'].str.contains("Night Life|Karaoke|NightLife|Nightlife"),'categories'] = 'Night Life'
df1.loc[df1['categories'].str.contains("Mass Media|Radio Stations"),'categories'] = 'Mass Media/Radio Stations'
df1.loc[df1['categories'].str.contains("Local Flavor"),'categories'] = 'Local Flavor'
In [35]:
dfa = df1['categories'].value_counts()[:10].sort_values(ascending=True)
label = dfa.index
size = dfa.values

trace = go.Bar(x=size,y=label,marker=dict(color=size,colorscale='Jet',showscale=False,reversescale=True)
               ,text=size,textposition='auto',orientation='h')

data= [trace]
layout = go.Layout(title='Top 10 Categories in Yelp',yaxis=dict(title='Categories',automargin=True),
                  xaxis=dict(title="Number of Businesses"),showlegend=False)

fig=go.Figure(data=data,layout=layout)
pio.write_image(fig, 'fig1.svg')
plotly.offline.iplot(fig)
In [36]:
dfr = df1[df1.categories.str.contains('Restaurants/Food')]
In [37]:
dfr = dfr.drop(['Unnamed: 0','address','attributes','hours'],axis=1)
dfr = dfr.dropna(subset=['postal_code'],axis=0)
In [38]:
dfr.city=dfr.city.astype(str).str.upper()
In [39]:
dfr.loc[dfr['city'].str.contains('MONTRÃ|MONTRE|MONTéAL'),'city'] = 'MONTREAL'
In [40]:
dfr.loc[dfr['city'].str.contains('TORO|TORN'),'city'] = 'TORONTO'
In [41]:
dfr.loc[dfr['city'].str.contains('MISSISSAUGA|MISSISAUGA|MISSISSUAGA'),'city'] = 'MISSISSAUGA'
In [42]:
dfr.loc[dfr['city'].str.contains('LAS V|LASV|LAS  VEGAS'),'city'] = 'LAS VEGAS'
In [43]:
dfr.loc[dfr['city'].str.contains('YORK'),'city'] = 'YORK'
In [44]:
dfr.loc[dfr['city'].str.contains('CALGARY'),'city'] = 'CALGARY'
In [45]:
# Function to format numbers to have comma thousands seperator
def thousands_format(x):
    return list(map('{:,d}'.format,x))
In [46]:
dfr1 = dfr.groupby(['city'])['review_count'].sum().sort_values(ascending=False)[:15].sort_values(ascending=True)
label = dfr1.index
size = dfr1.values

trace = go.Bar(x=size,y=label,marker=dict(color=size,colorscale='Jet',showscale=False,reversescale=True)
               ,text=thousands_format(size),textposition='auto',orientation='h')

data= [trace]
layout = go.Layout(title='Total Number of Reviews In Each City',yaxis=dict(title='City',automargin=True),
                  xaxis=dict(title="Number of Reviews",automargin=True),showlegend=False,width=1050,height=700)

fig=go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)
In [47]:
t = dfr[dfr['city']=='TORONTO']
mt = dfr[dfr['city']=='MONTREAL']
lv = dfr[dfr['city']=='LAS VEGAS']
p = dfr[dfr['city']=='PHOENIX']

TORONTO

In [48]:
dfr1 = t['stars'].value_counts()
label = dfr1.index
size = dfr1.values

trace = go.Bar(x=label,y=size,marker=dict(color=size,colorscale='Viridis',showscale=False,reversescale=True,line=dict(color='rgb(8,48,107)',width=2))
               ,text=size,textposition='auto',orientation='v',opacity=0.7)

data= [trace]
layout = go.Layout(title='Star Rating Distribution in Toronto',font=dict(size=16,color='black'),
                   yaxis=dict(title='# of Businesses',automargin=True),
                   xaxis=dict(title="Star Ratings",automargin=True),showlegend=False,width=1050,height=700)

fig=go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)
In [49]:
stars = sorted(t['stars'].unique())

All Restaurants in Toronto!

In [50]:
mapbox_access_token = 'pk.eyJ1IjoiZGVzY2l1aXR2IiwiYSI6ImNqcnZ1OWZ6MjA1eDYzeXBpOG5sZWd1NGcifQ.GQV-bvZFT62xzUBu2d4s6g'
map_style = "mapbox://styles/desciuitv/cjrwd4gn215au1ftfaphiuhhl"

my_text=['Name:' + name +'<br>Stars:' + str(stars) +'<br>Number of Reviews:' + str(revs)
  for name, stars, revs in zip(list(t['name']), list(t['stars']),
                           list(t['review_count'])) ] 

data = [
    go.Scattermapbox(
        lat=t['latitude'],
        lon=t['longitude'],
        mode='markers',
        marker=dict(size=6, color='gold', opacity=.5),
        text=my_text
    )
]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    width=1050,
    height=750,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=70,
        center=dict(
            lat=43.6512775,
            lon=-79.38878,
        ),
        pitch=70,
        zoom=14,
        style=map_style,
    
    ),
)

fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig)

Slide that bar (:

In [51]:
mapbox_access_token = 'pk.eyJ1IjoiZGVzY2l1aXR2IiwiYSI6ImNqcnZ1OWZ6MjA1eDYzeXBpOG5sZWd1NGcifQ.GQV-bvZFT62xzUBu2d4s6g'
map_style = "mapbox://styles/desciuitv/cjrwd4gn215au1ftfaphiuhhl"
        
#make figure
steps=[]
figure = {'data':[],
         'layout':{},
         'frames':[]
         }

figure['layout']['autosize'] = True
figure['layout']['hovermode'] ='closest'
figure['layout']['width'] = 1050
figure['layout']['height'] = 750
figure['layout']['mapbox'] = {'accesstoken':mapbox_access_token,'bearing':0,'center':{'lat':43.671094,'lon':-79.3874455},
                             'pitch':60,'zoom':12,'style':map_style}

figure['layout']['sliders'] = {
    'args': ["transition", {
            'duration': 1000,
            'easing': 'cubic-in-out'
        }
    ],
    # cast it to string because javascript doesn't have separate interger/float datatypes. As such, we cast it as string in Python
    'initialValue': str(stars[0]),
    'plotlycommand': 'animate',
    'values':str(stars[0]),
    'visible': True
}

figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 1000, 'redraw': False},
                         'fromcurrent': True, 'transition': {'duration': 300, 'easing': 'cubic-in-out'},'mode':'immediate'}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]

sliders_dict = {
    'active': 0,       #determines which button is considered active
    'steps': [],
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Stars:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration':300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0
}

#
for star in stars:
    slider_step = {'args': [
            [str(star)],
            {'frame': {'duration': 1000, 'redraw': False},
             'mode': 'immediate',
           'transition': {'duration': 300}} 
        ],
         'label':str(star),'value':str(star),
         'method': 'animate'}
    
    sliders_dict['steps'].append(slider_step)
    

#make data
for star in stars[:1]:
    t_by_stars = t[t['stars']==star]
    data_dict = {
            'type':'scattermapbox',
            'lat': list(t_by_stars['latitude']),
            'lon': list(t_by_stars['longitude']),
            'mode':'markers',
            'marker':{'size':6,'color':'gold','opacity':0.5},
            'text':['Name:' + name + '<br>Number of Reviews:' + str(revs) 
             for name, revs in zip(list(t_by_stars['name']), list(t_by_stars['review_count']))] 
            }
    figure['data'].append(data_dict)

#make frames
frames=[]
for star in stars:

    #frame = {'data':[],'name':str(star)}
    t_by_stars = t[t['stars']==star]
    
    data_dict = {
        'type':'scattermapbox',
        'lat': list(t_by_stars['latitude']),
        'lon': list(t_by_stars['longitude']),
        'mode':'markers',
        'marker':{'size':6,'color':'gold','opacity':0.5},
        'text':['Name:' + name + '<br>Number of Reviews:' + str(revs) 
             for name, revs in zip(list(t_by_stars['name']), list(t_by_stars['review_count']))] 
        }
    frame = {'data':[data_dict],'name':str(star)}
    #frame['data'].append(data_dict)
    frames.append(frame)
    #figure['frames'].append(frame) <- avoid this cause we want the list to contain only a single trace.. this has all the trace across all stars

figure['frames'] = frames
figure['layout']['sliders']=[sliders_dict]

plotly.offline.iplot(figure)
#iplot(figure)

Montreal

In [52]:
dfr1 = mt['stars'].value_counts()
label = dfr1.index
size = dfr1.values

trace = go.Bar(x=label,y=size,marker=dict(color=size,colorscale='Viridis',showscale=False,reversescale=True,line=dict(color='rgb(8,48,107)',width=2))
               ,text=size,textposition='auto',orientation='v',opacity=0.7)

data= [trace]
layout = go.Layout(title='Star Rating Distribution in Montreal',font=dict(size=16,color='black'),
                   yaxis=dict(title='# of Businesses',automargin=True),
                   xaxis=dict(title="Star Ratings",automargin=True),showlegend=False,width=1050,height=700)

fig=go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

ALl Restaurants in Montreal

In [53]:
mapbox_access_token = 'pk.eyJ1IjoiZGVzY2l1aXR2IiwiYSI6ImNqcnZ1OWZ6MjA1eDYzeXBpOG5sZWd1NGcifQ.GQV-bvZFT62xzUBu2d4s6g'
map_style = "mapbox://styles/desciuitv/cjrwd4gn215au1ftfaphiuhhl"

my_text=['Name:' + name +'<br>Stars:' + str(stars) +'<br>Number of Reviews:' + str(revs)
  for name, stars, revs in zip(list(mt['name']), list(mt['stars']),
                           list(mt['review_count'])) ] 
data = [
    go.Scattermapbox(
        lat=mt['latitude'],
        lon=mt['longitude'],
        mode='markers',
        marker=dict(
            size=6,
            color='gold',
            opacity=.5,
        ),
        text=my_text
    )
]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    width=1050,
    height=750,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=45.489505,
            lon=-73.599207,
        ),
        pitch=60,
        zoom=12,
        style=map_style,
    
    ),
)

fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig)

Keep slidin' it

In [54]:
mapbox_access_token = 'pk.eyJ1IjoiZGVzY2l1aXR2IiwiYSI6ImNqcnZ1OWZ6MjA1eDYzeXBpOG5sZWd1NGcifQ.GQV-bvZFT62xzUBu2d4s6g'
map_style = "mapbox://styles/desciuitv/cjrwd4gn215au1ftfaphiuhhl"

stars = sorted(mt['stars'].unique())
#make figure
steps=[]
figure = {'data':[],
         'layout':{},
         'frames':[]
         }

figure['layout']['autosize'] = True
figure['layout']['hovermode'] ='closest'
figure['layout']['width'] = 1050
figure['layout']['height'] = 750
figure['layout']['mapbox'] = {'accesstoken':mapbox_access_token,'bearing':0,'center':{'lat':45.489505,'lon':-73.599207},
                             'pitch':60,'zoom':12,'style':map_style}

figure['layout']['sliders'] = {
    'args': ["transition", {
            'duration': 1000,
            'easing': 'cubic-in-out'
        }
    ],
    # cast it to string because javascript doesn't have separate interger/float datatypes. As such, we cast it as string in Python
    'initialValue': str(stars[0]),
    'plotlycommand': 'animate',
    'values':str(stars[0]),
    'visible': True
}

figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 1000, 'redraw': False},
                         'fromcurrent': True, 'transition': {'duration': 300, 'easing': 'cubic-in-out'},'mode':'immediate'}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]

sliders_dict = {
    'active': 0,       #determines which button is considered active
    'steps': [],
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Stars:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration':300, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0
}

#
for star in stars:
    slider_step = {'args': [
            [str(star)],
            {'frame': {'duration': 1000, 'redraw': False},
             'mode': 'immediate',
           'transition': {'duration': 300}} 
        ],
         'label':str(star),'value':str(star),
         'method': 'animate'}
    
    sliders_dict['steps'].append(slider_step)
    

#make data
for star in stars[:1]:
    mt_by_stars = mt[mt['stars']==star]
    data_dict = {
            'type':'scattermapbox',
            'lat': list(mt_by_stars['latitude']),
            'lon': list(mt_by_stars['longitude']),
            'mode':'markers',
            'marker':{'size':6,'color':'gold','opacity':0.5},
            'text':['Name:' + name + '<br>Number of Reviews:' + str(revs) 
             for name, revs in zip(list(t_by_stars['name']), list(t_by_stars['review_count']))] 
            }
    figure['data'].append(data_dict)

    
#make frames
frames=[]
for star in stars:
    
    #frame = {'data':[],'name':str(star)}
    mt_by_stars = mt[mt['stars']==star]
    
    data_dict = {
        'type':'scattermapbox',
        'lat': list(mt_by_stars['latitude']),
        'lon': list(mt_by_stars['longitude']),
        'mode':'markers',
        'marker':{'size':6,'color':'gold','opacity':0.5},
        'text':['Name:' + name + '<br>Number of Reviews:' + str(revs) 
             for name, revs in zip(list(t_by_stars['name']), list(t_by_stars['review_count']))] 
    }
    frame = {'data':[data_dict],'name':str(star)}
    #frame['data'].append(data_dict)
    frames.append(frame)
    #figure['frames'].append(frame) <- avoid this cause we want the list to contain only a single trace.. this has all the trace across all stars

figure['frames'] = frames
figure['layout']['sliders']=[sliders_dict]

plotly.offline.iplot(figure)
#iplot(figure)

LAS VEGAS

In [55]:
dfr1 = lv['stars'].value_counts()
label = dfr1.index
size = dfr1.values

trace = go.Bar(x=label,y=size,marker=dict(color=size,colorscale='Viridis',showscale=False,reversescale=True,line=dict(color='rgb(8,48,107)',width=2))
               ,text=size,textposition='auto',orientation='v',opacity=0.7)

data= [trace]
layout = go.Layout(title='Star Rating Distribution in Las Vegas',font=dict(size=16,color='black'),
                   yaxis=dict(title='# of Businesses',automargin=True),
                   xaxis=dict(title="Star Ratings",automargin=True),showlegend=False,width=1050,height=700)

fig=go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)

LAS VEGAS IS LIT!

In [56]:
mapbox_access_token = 'pk.eyJ1IjoiZGVzY2l1aXR2IiwiYSI6ImNqcnZ1OWZ6MjA1eDYzeXBpOG5sZWd1NGcifQ.GQV-bvZFT62xzUBu2d4s6g'
map_style = "mapbox://styles/desciuitv/cjrwd4gn215au1ftfaphiuhhl"


my_text=['Name:' + name +'<br>Stars:' + str(stars) +'<br>Number of Reviews:' + str(revs)
  for name, stars, revs in zip(list(lv['name']), list(lv['stars']),
                           list(lv['review_count'])) ] 
data = [
    go.Scattermapbox(
        lat=lv['latitude'],
        lon=lv['longitude'],
        mode='markers',
        marker=dict(
            size=6,
            color='gold',
            opacity=.5,
        ),
        text=my_text,
    )
]

layout = go.Layout(
    autosize=True,
    hovermode='closest',
    width=1050,
    height=750,
    mapbox=dict(
        accesstoken=mapbox_access_token,
        bearing=0,
        center=dict(
            lat=36.110926,
            lon=-115.173089,
        ),
        pitch=60,
        zoom=14,
        style=map_style,
    
    ),
)

fig = dict(data=data, layout=layout)

plotly.offline.iplot(fig)

Yeeeeeeeeeeep!

In [57]:
mapbox_access_token = 'pk.eyJ1IjoiZGVzY2l1aXR2IiwiYSI6ImNqcnZ1OWZ6MjA1eDYzeXBpOG5sZWd1NGcifQ.GQV-bvZFT62xzUBu2d4s6g'
map_style = "mapbox://styles/desciuitv/cjrwd4gn215au1ftfaphiuhhl"

stars = sorted(lv['stars'].unique())
#make figure
steps=[]
figure = {'data':[],
         'layout':{},
         'frames':[]
         }

figure['layout']['autosize'] = True
figure['layout']['hovermode'] ='closest'
figure['layout']['width'] = 1050
figure['layout']['height'] = 750
figure['layout']['mapbox'] = {'accesstoken':mapbox_access_token,'bearing':0,'center':{'lat':36.110926,'lon':-115.173089},
                             'pitch':60,'zoom':11,'style':map_style}

figure['layout']['sliders'] = {
    'args': ["transition", {
            'duration': 1000,
            'easing': 'cubic-in-out'
        }
    ],
    # cast it to string because javascript doesn't have separate interger/float datatypes. As such, we cast it as string in Python
    'initialValue': str(stars[0]),
    'plotlycommand': 'animate',
    'values':str(stars[0]),
    'visible': True
}

figure['layout']['updatemenus'] = [
    {
        'buttons': [
            {
                'args': [None, {'frame': {'duration': 1000, 'redraw': False},
                         'fromcurrent': True, 'transition': {'duration': 300, 'easing': 'cubic-in-out'},'mode':'immediate'}],
                'label': 'Play',
                'method': 'animate'
            },
            {
                'args': [[None], {'frame': {'duration': 0, 'redraw': False}, 'mode': 'immediate',
                'transition': {'duration': 0}}],
                'label': 'Pause',
                'method': 'animate'
            }
        ],
        'direction': 'left',
        'pad': {'r': 10, 't': 87},
        'showactive': False,
        'type': 'buttons',
        'x': 0.1,
        'xanchor': 'right',
        'y': 0,
        'yanchor': 'top'
    }
]

sliders_dict = {
    'active': 0,       #determines which button is considered active
    'steps': [],
    'yanchor': 'top',
    'xanchor': 'left',
    'currentvalue': {
        'font': {'size': 20},
        'prefix': 'Stars:',
        'visible': True,
        'xanchor': 'right'
    },
    'transition': {'duration':200, 'easing': 'cubic-in-out'},
    'pad': {'b': 10, 't': 50},
    'len': 0.9,
    'x': 0.1,
    'y': 0
}

#
for star in stars:
    slider_step = {'args': [
            [str(star)],
            {'frame': {'duration': 1000, 'redraw': False},
             'mode': 'immediate',
           'transition': {'duration': 300}} 
        ],
         'label':str(star),'value':str(star),
         'method': 'animate'}
    
    sliders_dict['steps'].append(slider_step)

#make data
for star in stars[:1]:
    lv_by_stars = lv[lv['stars']==star]
    data_dict = {
            'type':'scattermapbox',
            'lat': list(lv_by_stars['latitude']),
            'lon': list(lv_by_stars['longitude']),
            'mode':'markers',
            'marker':{'size':6,'color':'gold','opacity':0.5},
            'text':['Name:' + name + '<br>Number of Reviews:' + str(revs) 
             for name, revs in zip(list(lv_by_stars['name']), list(lv_by_stars['review_count']))] 
            }
    figure['data'].append(data_dict)

#make frames
frames=[]
for star in stars:
    
    #frame = {'data':[],'name':str(star)}
    lv_by_stars = lv[lv['stars']==star]
    
    data_dict = {
        'type':'scattermapbox',
        'lat': list(lv_by_stars['latitude']),
        'lon': list(lv_by_stars['longitude']),
        'mode':'markers',
        'marker':{'size':6,'color':'gold','opacity':0.5},
        'text':['Name:' + name + '<br>Number of Reviews:' + str(revs) 
             for name, revs in zip(list(lv_by_stars['name']), list(lv_by_stars['review_count']))] 
        }
    frame = {'data':[data_dict],'name':str(star)}
    #frame['data'].append(data_dict)
    frames.append(frame)
    #figure['frames'].append(frame) <- avoid this cause we want the list to contain only a single trace.. this has all the trace across all stars

figure['frames'] = frames
figure['layout']['sliders']=[sliders_dict]

plotly.offline.iplot(figure)
#iplot(figure)
In [59]:
df2.drop(columns=['Unnamed: 0'],inplace=True)
In [61]:
import re
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from tqdm import tqdm, tqdm_pandas
import warnings
warnings.filterwarnings("ignore")
In [62]:
dfn = pd.merge(df2,df1[['business_id','name','city','state']], on='business_id',how='right')

After merging Business and Reviews dataset...

There are 192,127 unique businesses.
We have 6,683,763 restaurant reviews by 1,636,547 unique users!

In [ ]:
def percentile(n):
    def percentile_(x):
        return np.percentile(x, n)
    percentile_.__name__ = 'percentile_%s' % n
    return percentile_
In [66]:
dfn.dropna(inplace=True)
In [68]:
dp = dfn['stars'].value_counts()
label = dp.index
size = dp.values

trace = go.Bar(x=label,y=size,marker=dict(color=size,colorscale='Portland',showscale=False,reversescale=True,line=dict(color='rgb(8,48,107)',width=2))
               ,text=thousands_format(size),textposition='auto',orientation='v',opacity=0.7)

data= [trace]
layout = go.Layout(title='Star Rating Distribution for all Reviews',font=dict(size=16,color='black'),
                   yaxis=dict(title='# of Businesses',automargin=True),
                   xaxis=dict(title="Star Ratings",automargin=True),showlegend=False,width=1050,height=700)

fig=go.Figure(data=data,layout=layout)
plotly.offline.iplot(fig)
In [69]:
dfn['text_length'] = dfn['text'].apply(len)
In [70]:
plt.style.use('fivethirtyeight')
plt.figure(figsize=(13,13))
sns.boxplot(x='stars',y='text_length',data=dfn,palette='Accent')
#sns.boxplot(y='stars',x='text_length',data=dfm,palette='Accent')
plt.ylabel("Length of text", fontsize=17)
plt.xlabel("Stars",fontsize=17)
plt.title("Distribution of text length for star ratings")
plt.show()
In [71]:
dfnn = dfn.groupby('stars')['text_length'] \
       .agg([('Mean',np.mean),('Median',np.median),('25th Percentile',lambda x: np.percentile(x,q = 25))]) \
       .reset_index()
print(dfnn)
   stars        Mean  Median  25th Percentile
0    1.0  762.697060     542              300
1    2.0  765.770812     573              322
2    3.0  713.849289     542              303
3    4.0  627.339078     461              254
4    5.0  483.292364     341              200

The average text length for NEGATIVE reviews are longer than POSITIVE reviews!

In [106]:
df2=pd.read_csv('term_freq.csv',header=None, skiprows=1,index_col=0)
In [107]:
df2.columns = ['negative', 'positive']
df2['total'] = df2['negative'] + df2['positive']
In [98]:
%%time
vect1 = CountVectorizer(stop_words='english',max_features=10000)
vect1.fit(df.reviews)
Wall time: 3min 4s
In [99]:
%%time
document_matrix = vect1.transform(df.reviews)
Wall time: 2min 59s
In [100]:
(document_matrix).shape
Out[100]:
(3085663, 10000)
In [101]:
%%time
neg_batches = np.linspace(0,1542841,10).astype(int)
i=0
neg_tf = []
while i < len(neg_batches)-1:
    batch = np.sum(document_matrix[neg_batches[i]:neg_batches[i+1]].toarray(),axis=0)
    neg_tf.append(batch)
    print(neg_batches[i+1],"entries' term frequency calculated")
    i += 1
171426 entries' term frequency calculated
342853 entries' term frequency calculated
514280 entries' term frequency calculated
685707 entries' term frequency calculated
857133 entries' term frequency calculated
1028560 entries' term frequency calculated
1199987 entries' term frequency calculated
1371414 entries' term frequency calculated
1542841 entries' term frequency calculated
Wall time: 53.6 s
In [102]:
%%time
pos_batches = np.linspace(1542842,3085662,10).astype(int)
i=0
pos_tf = []
while i < len(pos_batches)-1:
    batch = np.sum(document_matrix[pos_batches[i]:pos_batches[i+1]].toarray(),axis=0)
    pos_tf.append(batch)
    print(pos_batches[i+1],"entries' term frequency calculated")
    i += 1
1714266 entries' term frequency calculated
1885690 entries' term frequency calculated
2057115 entries' term frequency calculated
2228539 entries' term frequency calculated
2399964 entries' term frequency calculated
2571388 entries' term frequency calculated
2742813 entries' term frequency calculated
2914237 entries' term frequency calculated
3085662 entries' term frequency calculated
Wall time: 53.9 s
In [103]:
#neg_mat = vect1.transform(df[df.target == 0].reviews)
#pos_mat = vect1.transform(df[df.target == 1].reviews)
neg = np.sum(neg_tf,axis=0)
pos = np.sum(pos_tf,axis=0)
tf = pd.DataFrame([neg,pos],columns=vect1.get_feature_names()).transpose()
tf.columns = ['negative', 'positive']
tf['total'] = tf['negative'] + tf['positive']
In [108]:
plt.figure(figsize=(12,12))
plt.style.use('fivethirtyeight')

negwords = tf.sort_values(by='negative', ascending=False)['negative'][:50].sort_values()
y_pos = np.arange(50)

plt.barh(y_pos, negwords, color='brown',alpha=0.7)

plt.yticks(y_pos, negwords.index)
plt.ylabel('Negative Tokens')
plt.xlabel('Frequency')
plt.title('Top 50 (negative) tokens')

plt.show()
In [109]:
plt.figure(figsize=(12,12))
plt.style.use('fivethirtyeight')

poswords = df2.sort_values(by='positive', ascending=False)['positive'][:50].sort_values()
y_pos = np.arange(50)

plt.barh(y_pos, poswords, color='green',alpha=0.7)

plt.yticks(y_pos, poswords.index)
plt.ylabel('Positive Tokens')
plt.xlabel('Frequency')
plt.title('Top 50 (Positive) tokens')

plt.show()

Relationship between Negative and Positive Tokens

In [91]:
plt.figure(figsize=(8,6))

ax = sns.regplot(x="negative", y="positive",fit_reg=False, scatter_kws={'alpha':0.5},data=tf)

plt.ylabel('Positive Frequency')
plt.xlabel('Negative Frequency')
plt.title('Negative Frequency vs Positive Frequency')

plt.show()

From the graph above, it doesn't seem like there is any meaningful relationship between positive and negative terms... But there's always a solution! Drawing inspiration from Jason Kessler in PyData 2017, he presented some metric used in his library Scattertext to extract meaningful tokens from the frequency data.

Since I didn't bother learning to use the library due to the lack of documentation, I decided to implement the metric underlying the library

We'll try to find a more meaningful metric that will allow us to characterise important tokens in each class. Intuitively, the frequency of a word in one class over the other can be a reasonable measure of how important the word is in characterizing the class. First, we'll define the positive rate as $$\text{pos_rate} = \frac{\text{frequency of a positive word }}{\text{total occurence of a word (in positive and negative reviews)}}$$

In [110]:
tf['pos_rate'] = tf['positive']/tf['total']
tf.sort_values(by='pos_rate', ascending=False).head(5)
Out[110]:
negative positive total pos_rate
addicting 87 1820 1907 0.954379
gem 1495 23330 24825 0.939778
unassuming 131 1902 2033 0.935563
cutest 99 1374 1473 0.932790
perfection 1406 18455 19861 0.929208

As we can see, words with the highest positive rate metric has relatively very low frequency in negative reviews. Also, the overall frequency of these words also makes it unlikely that we can use it as a reliable metric for positive reviews.

Alternatively, another metric is the frequency a words occurs in the class. This is defined as $${\text{Percentage of positive frequency}} = \frac {\text{frequency of a positive word}} {\Sigma \text{ positive words}}$$

In [111]:
tf['pos_freq_pct'] = tf['positive']/tf['positive'].sum()
tf.sort_values(by='pos_rate', ascending=False).head(5)
Out[111]:
negative positive total pos_rate pos_freq_pct
addicting 87 1820 1907 0.954379 0.000029
gem 1495 23330 24825 0.939778 0.000375
unassuming 131 1902 2033 0.935563 0.000031
cutest 99 1374 1473 0.932790 0.000022
perfection 1406 18455 19861 0.929208 0.000296

Since the positive frequency percentage is just scaled over the total sum of positive words, it has the same rank as the positive rate.

So, to come up with an effective metric that reflects both positive rate and positive frequency percentage, we will utilize harmonic mean instead of the arithmetic mean together with the CDF(Cumulative Distribution Function) of both positive rate and positive frequency rate. CDF can be explained as "distribution function of X, evaluated at x, is the probability that X will take a value less than or equal to x". The harmonic mean for our purpose helps us to draw a relationship between two rates or ratios of different units such as our two measures above that differs only in their denominator. The harmonic mean H of a positive real number is defined as: $${H} = \frac {n}{\sum_{i=1}^{n}\ \frac{1}{x_i}}$$

As we can see below, the word "delicious" has a pos_rate_normcdf of 0.994893 and pos_freq_pct_normcdf of 1. This means that roughly 99.49% of the tokens will take a pos_rate value less or equal to 0.893093 and 100% will take pos_freq_pct value of 0.004560 or less.

We see that pos_normcdf_hmean metric provides a more meaningful measure of how important a word is within the class! Next, we'll apply the same calculation to negative words.

In [112]:
from scipy.stats import norm
from scipy.stats import hmean

def normcdf(x):
    return norm.cdf(x, x.mean(), x.std())

#cdf of positive rate
tf['pos_rate_normcdf'] = normcdf(tf['pos_rate'])

#cdf of the positive rate frequency
tf['pos_freq_pct_normcdf'] = normcdf(tf['pos_freq_pct'])

#harmonic mean of the cdf of positive rate and cdf of positive rate frequency
tf['pos_normcdf_hmean'] = hmean([tf['pos_rate_normcdf'], tf['pos_freq_pct_normcdf']])

tf.sort_values(by='pos_normcdf_hmean', ascending=False).iloc[:10]
Out[112]:
negative positive total pos_rate pos_freq_pct pos_rate_normcdf pos_freq_pct_normcdf pos_normcdf_hmean
delicious 30294 253073 283367 0.893093 0.004063 0.991599 1.000000 0.995782
perfect 16058 114490 130548 0.876995 0.001838 0.989492 0.999992 0.994715
amazing 38271 270966 309237 0.876241 0.004351 0.989383 1.000000 0.994663
excellent 21393 137053 158446 0.864982 0.002200 0.987634 1.000000 0.993778
fantastic 10721 77291 88012 0.878187 0.001241 0.989663 0.997711 0.993671
awesome 24298 153224 177522 0.863127 0.002460 0.987323 1.000000 0.993621
highly 20568 111007 131575 0.843679 0.001782 0.983633 0.999985 0.991742
wonderful 14176 78948 93124 0.847773 0.001268 0.984479 0.998143 0.991264
favorite 23606 119468 143074 0.835008 0.001918 0.981713 0.999997 0.990771
loved 19659 94419 114078 0.827671 0.001516 0.979941 0.999783 0.989762
In [113]:
tf['neg_rate'] = tf['negative']/tf['total']
tf['neg_freq_pct'] = tf['negative']/tf['negative'].sum()
tf['neg_rate_normcdf'] = normcdf(tf['neg_rate'])
tf['neg_freq_pct_normcdf'] = normcdf(tf['neg_freq_pct'])
tf['neg_normcdf_hmean'] = hmean([tf['neg_rate_normcdf'], tf['neg_freq_pct_normcdf']])
tf.sort_values(by='neg_normcdf_hmean', ascending=False).iloc[:10]
Out[113]:
negative positive total pos_rate pos_freq_pct pos_rate_normcdf pos_freq_pct_normcdf pos_normcdf_hmean neg_rate neg_freq_pct neg_rate_normcdf neg_freq_pct_normcdf neg_normcdf_hmean
worst 166038 3872 169910 0.022789 0.000062 0.017358 0.462549 0.033460 0.977211 0.001944 0.982642 1.000000 0.991245
rude 163946 5956 169902 0.035056 0.000096 0.020271 0.495666 0.038948 0.964944 0.001920 0.979729 0.999999 0.989761
horrible 141940 5296 147236 0.035969 0.000085 0.020503 0.485164 0.039344 0.964031 0.001662 0.979497 0.999984 0.989634
terrible 123125 5771 128896 0.044773 0.000093 0.022863 0.492721 0.043698 0.955227 0.001442 0.977137 0.999824 0.988351
poor 86689 5940 92629 0.064127 0.000095 0.028862 0.495411 0.054546 0.935873 0.001015 0.971138 0.992604 0.981754
told 448839 51868 500707 0.103590 0.000833 0.045169 0.965695 0.086301 0.896410 0.005255 0.954831 1.000000 0.976894
manager 221963 28819 250782 0.114917 0.000463 0.051023 0.816298 0.096042 0.885083 0.002599 0.948977 1.000000 0.973821
paid 112818 16207 129025 0.125611 0.000260 0.057089 0.654737 0.105021 0.874389 0.001321 0.942911 0.999428 0.970347
money 198866 30033 228899 0.131206 0.000482 0.060482 0.828889 0.112738 0.868794 0.002328 0.939518 1.000000 0.968816
asked 365595 55461 421056 0.131719 0.000890 0.060800 0.975255 0.114465 0.868281 0.004281 0.939200 1.000000 0.968647

Let's check out the CDF Harmonic mean of positive and negative reviews

In [114]:
plt.figure(figsize=(11,8))

ax = sns.regplot(x="neg_normcdf_hmean", y="pos_normcdf_hmean",fit_reg=False, scatter_kws={'alpha':0.5},data=tf)
plt.ylabel('Positive Rate and Frequency CDF Harmonic Mean', size=14)
plt.xlabel('Negative Rate and Frequency CDF Harmonic Mean',size=14)
plt.title('neg_normcdf_hmean VS pos_normcdf_hmean')

plt.show()
In [115]:
import bokeh.io
from bokeh.plotting import figure
from bokeh.io import output_notebook, show
from bokeh.resources import INLINE
from bokeh.models import LinearColorMapper
from bokeh.models import HoverTool
from bokeh.resources import INLINE

output_notebook()
color_mapper = LinearColorMapper(palette='Magma256', low=min(tf.pos_normcdf_hmean), high=max(tf.pos_normcdf_hmean))
p = figure(x_axis_label='neg_normcdf_hmean', y_axis_label='pos_normcdf_hmean')
p.circle('neg_normcdf_hmean','pos_normcdf_hmean',size=5,alpha=0.5,source=tf,color={'field': 'pos_normcdf_hmean', 'transform': color_mapper})
hover = HoverTool(tooltips=[('token','@index')])
p.add_tools(hover)
show(p)
Loading BokehJS ...